This is a case study for Google Data Analyst Certificate:
Bellabeat is a high-tech company that manufactures health-focused smart products. It is a successful small company, but it has the potential to become a larger player in the global smart device market. Since it was founded in 2013, Bellabeat has grown rapidly and quickly positioned itself as a tech-driven wellness company for women.
Our task is to focus on a Bellabeat product and analyze smart device usage data in order to gain insight into how people are already using their smart devices. Then, using this information, we need to recommend how these trends can inform Bellabeat's marketing strategy.
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(tidyverse) #wrangle data
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks plotly::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dplyr) #clean data
library(lubridate) #wrangle date attributes
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(skimr) #get summary data
library(ggplot2) #visualize data
library(cowplot) #grid the plot
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:lubridate':
##
## stamp
library(readr) #save csv
library(tidyr) #for organizing tabular data
library(janitor) #for data examination and cleaning
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
# Directory that holds the Fitabase CSV export.
path <- file.path(
  "C:", "Users", "Lenovo", "Documents", "Bellabeat",
  "Fitabase Data 4.12.16-5.12.16"
)
# The relative file names below are resolved against this directory.
# NOTE(review): setwd() changes the working directory for the rest of the
# session; anything that later reads or writes relative paths depends on it.
setwd(path)

# Load the four tables used in the analysis.
daily_activity <- read.csv(file = "dailyActivity_merged.csv")
sleep_day <- read.csv(file = "sleepDay_merged.csv")
weight <- read.csv(file = "weightLogInfo_merged.csv")
hourly_steps <- read.csv(file = "hourlySteps_merged.csv")
Examining the first few rows of every data set
# Preview the first rows of each table.
head(daily_activity)
head(sleep_day)
head(weight)
head(hourly_steps)
# Count the missing (NA) cells in each table. The printed totals show that
# only weight contains missing values.
sum(is.na(daily_activity))
## [1] 0
sum(is.na(sleep_day))
## [1] 0
sum(is.na(weight))
## [1] 65
sum(is.na(hourly_steps))
## [1] 0
sleep_day has 3 duplicate rows; the other tables have none
# Count fully duplicated rows in each table (a row counts as a duplicate
# when every column matches an earlier row).
sum(duplicated(daily_activity))
## [1] 0
sum(duplicated(sleep_day))
## [1] 3
sum(duplicated(weight))
## [1] 0
sum(duplicated(hourly_steps))
## [1] 0
Relative to an expected 30 users, daily_activity has 3 extra users (33), sleep_day has 6 fewer users (24), and weight has 22 fewer users (8)
# Number of distinct user Ids present in each table.
n_distinct(daily_activity$Id)
## [1] 33
n_distinct(sleep_day$Id)
## [1] 24
n_distinct(weight$Id)
## [1] 8
n_distinct(hourly_steps$Id)
## [1] 33
Remove duplicates from sleep_day
# Keep only the first occurrence of each fully identical row.
sleep_day <- unique(sleep_day)
# Confirm that no duplicate rows remain (expected: 0).
sum(duplicated(sleep_day))
## [1] 0
Convert ActivityHour in hourly_steps into date-time format
Add a column for the hour of the day in hourly_steps
# Parse the 12-hour clock timestamp strings into POSIXct values.
hourly_steps[["ActivityHour"]] <- as.POSIXct(
  hourly_steps[["ActivityHour"]],
  format = "%m/%d/%Y %I:%M:%S %p"
)
# Extract the hour of day as a two-digit 24-hour string.
hourly_steps[["hour"]] <- format(hourly_steps[["ActivityHour"]], format = "%H")
head(hourly_steps)
We will classify the average user’s sleep into three patterns
# Average each user's minutes asleep, then label the user by that average:
#   below 300 minutes        -> "Unhealthy Sleep"
#   300 to 420 minutes       -> "Normal Sleep"
#   above 420 minutes        -> "Healthy Sleep"
# The "Normal Sleep" band starts at 300 so the three bands cover every value.
# With the earlier lower bound of 320, averages in [300, 320) matched no
# branch and were labelled NA.
# NOTE(review): confirm that 300 (not 320) is the intended cut-off.
# The column name `Categores` is kept as-is because later chunks refer to it.
sleep_day_new <- sleep_day %>%
  group_by(Id) %>%
  summarise(avg_time_sleep = mean(TotalMinutesAsleep)) %>%
  mutate(
    Categores = case_when(
      avg_time_sleep < 300 ~ "Unhealthy Sleep",
      avg_time_sleep <= 420 ~ "Normal Sleep",
      avg_time_sleep > 420 ~ "Healthy Sleep"
    )
  )
We will separate users into fitness groups based on their average daily steps: “Sedentary”, “Needs Improvement”, “Active”, and “Highly Active”.
# Average each user's daily step count and assign an activity level.
# case_when() takes the first matching branch, so each upper bound below
# implicitly carries the lower bound of the branch before it.
Steps_categores <- daily_activity %>%
  group_by(Id) %>%
  summarise(avg_steps = mean(TotalSteps)) %>%
  mutate(
    level_steps = case_when(
      avg_steps < 5000 ~ "Sedentary",
      avg_steps < 10000 ~ "Needsimprovment",
      avg_steps < 12500 ~ "Active",
      avg_steps >= 12500 ~ "Highly active"
    )
  )
Merge the data tables
# Combine activity, sleep and weight into one table with one row per user per
# calendar day.
#
# The tables are joined on Id AND day. Joining on Id alone pairs every
# activity day of a user with every sleep and weight record of that user,
# which multiplies the rows and distorts every statistic and chart built on
# data_marged.
#
# Assumes sleep_day keeps its timestamp in `SleepDay` and weight in `Date`
# (month/day/year followed by a time) -- TODO confirm against the head()
# output; the check below fails fast if the columns are missing.
stopifnot(
  "ActivityDate" %in% names(daily_activity),
  "SleepDay" %in% names(sleep_day),
  "Date" %in% names(weight)
)

# Temporary Date-typed join key; the time-of-day part of the strings is
# ignored because parsing stops once the format is satisfied.
activity_keyed <- daily_activity
activity_keyed$merge_date <- as.Date(
  as.character(daily_activity$ActivityDate),
  format = "%m/%d/%Y"
)
sleep_keyed <- sleep_day
sleep_keyed$merge_date <- as.Date(
  as.character(sleep_day$SleepDay),
  format = "%m/%d/%Y"
)
weight_keyed <- weight
weight_keyed$merge_date <- as.Date(
  as.character(weight$Date),
  format = "%m/%d/%Y"
)

# Full outer joins: days that appear in only one table are kept, with NA in
# the columns coming from the other tables.
merged1 <- merge(
  activity_keyed, sleep_keyed,
  by = c("Id", "merge_date"), all = TRUE
)
data_marged <- merge(
  merged1, weight_keyed,
  by = c("Id", "merge_date"), all = TRUE
)
# Drop the helper key so data_marged has the same columns as before.
data_marged$merge_date <- NULL
Convert Activity Date into date format and add a column for day of the week
# Add the day of the week for each activity date as a factor whose levels run
# Sunday to Saturday, so charts show the days in calendar order.
#
# The name is looked up from the numeric weekday (0 = Sunday) instead of
# weekdays(), whose output follows the session locale: on a non-English
# system its names would not match the English levels and every value would
# become NA.
weekday_levels <- c(
  "Sunday", "Monday", "Tuesday", "Wednesday",
  "Thursday", "Friday", "Saturday"
)
activity_wday <- as.POSIXlt(
  as.Date(as.character(data_marged$ActivityDate), format = "%m/%d/%Y")
)$wday
data_marged$weekday <- factor(
  weekday_levels[activity_wday + 1L],
  levels = weekday_levels
)
Summary statistics (mean, median, min, max) for data_marged
# Summarise the distance, sleep, step, calorie, weekday and weight columns.
summary(
  dplyr::select(
    data_marged,
    TotalDistance, TotalMinutesAsleep, TotalSteps, TotalTimeInBed,
    Calories, weekday, WeightPounds, BMI
  )
)
## TotalDistance TotalMinutesAsleep TotalSteps TotalTimeInBed
## Min. : 0.000 Min. : 58.0 Min. : 0 Min. : 61.0
## 1st Qu.: 3.910 1st Qu.:400.0 1st Qu.: 5832 1st Qu.:421.0
## Median : 6.820 Median :442.0 Median :10199 Median :457.0
## Mean : 6.415 Mean :433.8 Mean : 9373 Mean :458.2
## 3rd Qu.: 8.350 3rd Qu.:477.0 3rd Qu.:12109 3rd Qu.:510.0
## Max. :28.030 Max. :796.0 Max. :36019 Max. :961.0
## NA's :971 NA's :971
## Calories weekday WeightPounds BMI
## Min. : 0 Sunday :5610 Min. :116.0 Min. :21.45
## 1st Qu.:1850 Monday :5609 1st Qu.:134.9 1st Qu.:23.89
## Median :2046 Tuesday :7004 Median :135.6 Median :24.00
## Mean :2103 Wednesday:6988 Mean :139.6 Mean :24.42
## 3rd Qu.:2182 Thursday :6930 3rd Qu.:136.7 3rd Qu.:24.21
## Max. :4900 Friday :5632 Max. :294.3 Max. :47.54
## Saturday :5616 NA's :8881 NA's :8881
Let’s look at how active users are per hour in total steps. From 5 p.m. to 7 p.m. users take the most steps
# Bar chart of steps per hour of day; bars stack every hourly record, so each
# bar's height is the total over all users and days.
# The axis label must be given as lower-case `y`: `Y` is not an aesthetic
# name, so the label was silently ignored and the axis showed "StepTotal".
ggplot(data = hourly_steps, aes(x = hour, y = StepTotal, fill = hour)) +
  geom_bar(stat = "identity") +
  labs(title = "Steps by Hour", x = "Hours", y = "Steps")
Let’s look at how users are distributed across the step categories. 54.5% of users fall into the
“Needs Improvement” category.
# Pie chart of the share of users in each step category.
# The slice sizes are the number of users per category, counted explicitly.
# The earlier `value = ~avg_steps` was not a pie attribute (the attribute is
# `values`), so plotly ignored it and fell back to counting labels; passing
# the counts directly gives the same user shares without relying on that.
Steps_categores %>%
  count(level_steps, name = "users") %>%
  plot_ly(
    labels = ~level_steps,
    values = ~users,
    type = "pie",
    textposition = "outside",
    textinfo = "label+percent"
  ) %>%
  layout(title = "Users categories by steps")
# Bar chart of steps by weekday; each bar sums TotalSteps over every row of
# data_marged that falls on that day.
ggplot(data_marged, aes(weekday, TotalSteps)) +
  geom_col(fill = "steelblue") +
  labs(title = "Weeklyday Steps", x = "Weekday", y = "Steps")
Let’s look at how users are distributed across the average-sleep categories.
# Fix the category order (worst to best) for display.
sleep_day_new$Categores <- factor(
  sleep_day_new$Categores,
  levels = c("Unhealthy Sleep", "Normal Sleep", "Healthy Sleep")
)
# Pie chart of the share of users in each sleep category.
# The slice sizes are the number of users per category, counted explicitly.
# The earlier `value = ~avg_time_sleep` was not a pie attribute (the
# attribute is `values`), so plotly ignored it and fell back to counting
# labels; passing the counts directly gives the same user shares.
sleep_day_new %>%
  count(Categores, name = "users") %>%
  plot_ly(
    labels = ~Categores,
    values = ~users,
    type = "pie",
    textposition = "outside",
    textinfo = "label+percent"
  ) %>%
  layout(title = "usere categories by hourly sleep")
The more active you are, the more steps you take and the more calories you burn. This is an obvious fact, but we can still look at the data to find anything interesting. Here we see that some users have similar weights but very different activity: some users weighing about 60 kg burn more than 2,500 calories and take nearly 20,000 steps, while others at a similar weight burn just over 1,500 calories with only about 10,000 steps. There are also users weighing up to 120 kg who burn over 2,000 calories with far fewer steps, sometimes fewer than 5,000.
# Scatter plot of daily calories against daily steps, coloured by weight in
# kilograms (green = low, red = high), with a fitted linear trend line.
ggplot(
  data = data_marged,
  mapping = aes(x = TotalSteps, y = Calories, color = WeightKg)
) +
  geom_point() +
  stat_smooth(method = "lm") +
  scale_color_gradient(low = "green", high = "red") +
  labs(title = "Steps vs Calories", x = "Steps", y = "Calories")
## `geom_smooth()` using formula 'y ~ x'
And about the rumor that a person who sleeps a lot burns fewer calories,
we note that there is no significant correlation between sleeping more
and burning fewer calories.
# Scatter plot of minutes asleep against calories burned, coloured by minutes
# asleep (green = low, red = high), with a fitted linear trend line.
# The axis label must be given as lower-case `x`: `X` is not an aesthetic
# name, so the x-axis label was silently ignored.
ggplot(
  data = data_marged,
  aes(x = Calories, y = TotalMinutesAsleep, color = TotalMinutesAsleep)
) +
  geom_point() +
  stat_smooth(method = lm) +
  scale_color_gradient(low = "green", high = "red") +
  labs(title = "Sleep Vs Calories", x = "Calories", y = "Sleep Minutes")
## `geom_smooth()` using formula 'y ~ x'